0. Load data

# Load data
data <- readRDS("../../output/step_1/finalData.RDS")
dataMatrix <- readRDS("../../output/step_1/dataMatrix.RDS")
keyInfo <- readRDS("../../output/step_1/keyInfo.RDS")
metaInfo <- readRDS("../../output/step_1/metaInfo.RDS")

# Add 'Age' column
matchTable <- data.frame("age"=c("60", "44", "31", "25", "25", "43"), 
                         "SampleID"=c("105", "218", "261", "043", "160", "149"),
                         stringsAsFactors=FALSE)

keyInfo <- merge(keyInfo, matchTable, all.x=TRUE)

# Remove redundant data
rm(matchTable)

1. Perform a simple mean-difference hypothesis test between two groups.

# Perform t.test on data
ttest <- apply(dataMatrix, 
              1, 
              function(x) t.test(x[keyInfo$age < 35], x[keyInfo$age >= 35]))

# Obtain p-values
pValues <- data.frame(pValue = sapply(ttest, function(x) x$p.value),
                     effectSize = sapply(ttest, function(x) diff(x$estimate)))

2. Visualize top 5 most significant methylation positions (rows)

# Separate data in two groups by age
firstGroup <- keyInfo[keyInfo$age < 35, ]$Sample_Name
secondGroup <- keyInfo[keyInfo$age >= 35, ]$Sample_Name

# Obtain most significant 5
topRows <- pValues[order(pValues$pValue), ]
topRows <- head(topRows, 5)

# Obtain these rows from dataMatrix
topRowsData <- dataMatrix[match(rownames(topRows), rownames(dataMatrix)), 
                          order(keyInfo$age)]

# Change the column names so that classification can be done
for(i in 1:55) {
  
  if(colnames(topRowsData)[i] %in% firstGroup) {
    colnames(topRowsData)[i] <- "first"
  }
  else {
    colnames(topRowsData)[i] <- "second"
  }
  
}

# Generate plots
for(i in 1:5) {
  plot(
    topRowsData[i, ], 
    main = rownames(topRowsData)[i],
    ylab = "Methylation level", 
    col = as.factor(colnames(topRowsData))
  )
  
  legend("topright", 
         inset=.02, 
         title="Age", 
         c("< 35", ">= 35"), 
         fill=c("red", "black"), 
         horiz=TRUE, 
         cex=0.8)
}

# Remove redundant data
rm(firstGroup)
rm(secondGroup)
rm(topRows)
rm(i)

3. Provide a table with number of significant rows at the following levels

# Produce a table from dataframe
valuesTable <- data.frame(
  "levels"          = c("alpha = 0.1", "alpha = 0.05",
                        "alpha = 0.01", "FDR correction = 0.05", 
                        "Bonferroni correction = 0.05"),
    "significantRows" = c(sum(pValues$pValue < 0.1),
                        sum(pValues$pValue < 0.05),
                        sum(pValues$pValue < 0.01),
                        sum(p.adjust(pValues$pValue, 
                                    method = "fdr") < 0.05),
                        sum(p.adjust(pValues$pValue,
                                    method = "bonferroni") < 0.05)
                         )
)

# Generate the table
kable(valuesTable, "html") %>%
      kable_styling(font_size = 10) %>%
      kable_styling("striped") %>%
      scroll_box(width = "100%")
levels significantRows
alpha = 0.1 53307
alpha = 0.05 29068
alpha = 0.01 6159
FDR correction = 0.05 0
Bonferroni correction = 0.05 0

4. Visualize the histogram

hist(
  pValues$pValue,
  main = "p-values",
  xlab = "p-value",
  col = c("gray", "lightpink")
)

5. Visualize the volcano-plot

plot(
  pValues$effectSize,
  -log(pValues$pValue),
  main = "Volcano plot",
    xlab = "Effect size",
  ylab = "p-value log",
  col = c("gray", "lightpink")
)

6. Visualize a manhattan plot

plot(
  x = metaInfo$pos[metaInfo$chr == "chrX"],
  y = -log10(pValues$pValue[metaInfo$chr == "chrX"]),
  main = "Manhattan plot",
  xlab = "Chromosome X positions",
  ylab = "-log10",
  col = c("gray", "lightpink")
)

7. Perform the first step using a linear model ???